DSAN 5100 Hypothesis Testing

Author

Emily Mitchum

Imports

library(stats)
library(ggplot2)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

Read in Clean Data

earnings_df <-read.csv('/Users/Emi/Library/CloudStorage/GoogleDrive-emilymitchum265@gmail.com/Shared drives/DSAN/DSAN 5100/Final Project/vocational_school_employment_outcomes/Cleaned data/Cleaned_quarterly_earnings_data.csv')
head(earnings_df)
        date education_level median_weekly_earnings
1 2019-01-01         ba_only                   1213
2 2019-04-01         ba_only                   1236
3 2019-07-01         ba_only                   1281
4 2019-10-01         ba_only                   1259
5 2020-01-01         ba_only                   1263
6 2020-04-01         ba_only                   1303
tuition_df <- read.csv('/Users/Emi/Library/CloudStorage/GoogleDrive-emilymitchum265@gmail.com/Shared drives/DSAN/DSAN 5100/Final Project/vocational_school_employment_outcomes/Cleaned data/Cleaned_tuition_data.csv')
head(tuition_df)
  Academic.Year average_tuition     degree_granting year_parsed
1       2019-20           15747 Non-Degree Granting  2019-01-01
2       2019-20           25447     Degree Granting  2019-01-01
3       2020-21           26084     Degree Granting  2020-01-01
4       2020-21           15755 Non-Degree Granting  2020-01-01
5       2021-22           26572     Degree Granting  2021-01-01
6       2021-22           15978 Non-Degree Granting  2021-01-01
  public_or_private
1            public
2            public
3            public
4            public
5            public
6            public
emp_ratio_df <- read.csv('/Users/Emi/Library/CloudStorage/GoogleDrive-emilymitchum265@gmail.com/Shared drives/DSAN/DSAN 5100/Final Project/vocational_school_employment_outcomes/Cleaned data/Cleaned_monthly_emp_pop_ratio.csv')
head(emp_ratio_df)
        date education_level employment_population_ratio
1 2019-01-01         ba_plus                        72.1
2 2019-02-01         ba_plus                        72.1
3 2019-03-01         ba_plus                        72.2
4 2019-04-01         ba_plus                        72.3
5 2019-05-01         ba_plus                        72.2
6 2019-06-01         ba_plus                        72.3
unemp_df <- read.csv('/Users/Emi/Library/CloudStorage/GoogleDrive-emilymitchum265@gmail.com/Shared drives/DSAN/DSAN 5100/Final Project/vocational_school_employment_outcomes/Cleaned data/Cleaned_monthly_unemployment.csv')
head(unemp_df)
        date    education_level unemployment_rate
1 2019-01-01 some_college_assoc               3.5
2 2019-02-01 some_college_assoc               3.2
3 2019-03-01 some_college_assoc               3.4
4 2019-04-01 some_college_assoc               3.1
5 2019-05-01 some_college_assoc               2.7
6 2019-06-01 some_college_assoc               3.0

ANOVA on Quarterly Median Weekly Earnings Data

# Correct data types before ANOVA
earnings_df <- earnings_df %>%
  mutate(
    year = factor(format(as.Date(date), "%Y")),
    education_level = factor(education_level)
  )

tuition_df <- tuition_df |> mutate(year = factor(format(as.Date(year_parsed), "%Y")))

Visual Analysis of Earnings by Education Level

library(ggplot2)

ggplot(earnings_df, aes(
  x = education_level,
  y = median_weekly_earnings,
  fill = education_level
)) +
  geom_boxplot(alpha = 0.85, outlier.color = "gray40") +
  scale_fill_brewer(palette = "Set2") +   # cleaner, professional palette
  labs(
    title = "Median Weekly Earnings by Education Level",
    x = "Education Level",
    y = "Median Weekly Earnings"
  ) +
  theme_minimal(base_size = 16) +         # larger base text for slides
  theme(
    text = element_text(family = "Helvetica", face = "bold"),
    plot.title = element_text(size = 22, face = "bold", hjust = 0.5),
    axis.title.x = element_text(size = 18, face = "bold"),
    axis.title.y = element_text(size = 18, face = "bold"),
    axis.text.x = element_text(size = 14, face = "bold", angle = 20, hjust = 1),
    axis.text.y = element_text(size = 14, face = "bold"),
    legend.position = "none",             # removes redundant legend
    panel.grid.major.x = element_blank()  # cleaner slide aesthetic
  )

One-Way ANOVA (Ignoring Time)

Check for equal variances

library(car)
Loading required package: carData

Attaching package: 'car'
The following object is masked from 'package:dplyr':

    recode
leveneTest(median_weekly_earnings ~ education_level, data = earnings_df)
Levene's Test for Homogeneity of Variance (center = median)
      Df F value   Pr(>F)   
group  2  6.9734 0.001669 **
      75                    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
leveneTest(median_weekly_earnings ~ education_level * year, data = earnings_df)
Levene's Test for Homogeneity of Variance (center = median)
      Df F value Pr(>F)
group 20  1.2631 0.2415
      57               
oneway <- aov(median_weekly_earnings ~ education_level, data = earnings_df)
summary(oneway)
                Df  Sum Sq Mean Sq F value Pr(>F)    
education_level  2 4505007 2252503   264.3 <2e-16 ***
Residuals       75  639267    8524                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The one-way ANOVA test revealed a highly significant effect of education level on median weekly earnings. This indicates that mean earnings differ substantially across the three education groups.

# Welch's ANOVA
oneway.test(median_weekly_earnings ~ education_level, data = earnings_df)

    One-way analysis of means (not assuming equal variances)

data:  median_weekly_earnings and education_level
F = 207.99, num df = 2.000, denom df = 48.517, p-value < 2.2e-16

Two-Way ANOVA (Analyzing Effect of Education Level & Time)

model_two_way <- aov(median_weekly_earnings ~ education_level * year, data = earnings_df)

summary(model_two_way)
                     Df  Sum Sq Mean Sq  F value   Pr(>F)    
education_level       2 4505007 2252503 4339.900  < 2e-16 ***
year                  6  578857   96476  185.881  < 2e-16 ***
education_level:year 12   30826    2569    4.949 1.54e-05 ***
Residuals            57   29584     519                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Bachelor’s degree holders consistently earn much more than those with some college/associate degrees, who in turn earn more than those with only a high school diploma.

#Games-Howell Test
library(rstatix)

Attaching package: 'rstatix'
The following object is masked from 'package:stats':

    filter
earnings_df |> games_howell_test(median_weekly_earnings ~ education_level)
# A tibble: 3 × 8
  .y.            group1 group2 estimate conf.low conf.high    p.adj p.adj.signif
* <chr>          <chr>  <chr>     <dbl>    <dbl>     <dbl>    <dbl> <chr>       
1 median_weekly… ba_on… high_…    -559.   -625.      -492. 1.03e-12 ****        
2 median_weekly… ba_on… some_…    -440.   -507.      -372. 1.04e-12 ****        
3 median_weekly… high_… some_…     119.     68.1      170. 2.37e- 6 ****        

ROI Analysis

limited_earnings_df <- earnings_df[earnings_df$education_level != "high_school",] 

limited_earnings_df <- limited_earnings_df |> 
  mutate(degree_category = case_when(
    education_level == "ba_only" ~ "Bachelor's",education_level == "some_college_assoc" ~ "Vocational",
    TRUE ~ "NA"
  ))

tuition_df <- tuition_df |> 
  mutate(degree_category = case_when(
    degree_granting == "Degree Granting" ~ "Bachelor's",
    degree_granting == "Non-Degree Granting" ~ "Vocational",
    TRUE ~ "NA"
  ))

Join Dataframes together to assess ROI

temp_earnings_df <- limited_earnings_df |> select(c("median_weekly_earnings","year","degree_category"))
temp_tuition_df <- tuition_df |> select(c("average_tuition","year","degree_category"))

merged_df <- merge(temp_earnings_df, temp_tuition_df, by = c("degree_category","year"))

roi_df <- merged_df |> mutate( ROI = median_weekly_earnings / average_tuition )

roi_yearly <- roi_df %>%
  group_by(degree_category, year) %>%
  summarize(ROI = mean(ROI), .groups = "drop") %>%
  arrange(degree_category, year)

roi_clean <- merged_df |>
  dplyr::group_by(degree_category, year) |>
  dplyr::summarise(
    annual_earnings = mean(median_weekly_earnings) * 52,
    tuition  = mean(average_tuition),
    ROI      = annual_earnings / tuition,  # annualized ROI
    .groups = "drop"
  )
library(plotly)

Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
plot <- plot_ly(roi_clean, x = ~year) %>%
  add_trace(
    y = ~ROI,
    color = ~degree_category,
    colors = c("Bachelor's" = "#E64B35", "Vocational" = "#4DBBD5"),
    type = "scatter",
    mode = "lines+markers",
    name = ~paste(degree_category, "ROI"),
    yaxis = "y"
  ) %>%

  add_trace(
    y = ~annual_earnings,
    x = ~year,
    split = ~degree_category,
    type = "bar",
    name = ~paste(degree_category, "Earnings"),
    opacity = 0.4,
    yaxis = "y2",
    showlegend = TRUE
  ) %>%

  add_trace(
    y = ~tuition,
    x = ~year,
    split = ~degree_category,
    type = "bar",
    name = ~paste(degree_category, "Tuition"),
    opacity = 0.4,
    yaxis = "y2",
    showlegend = TRUE
  ) %>%
  layout(
    title = "ROI with Earnings and Tuition Over Time",
    barmode = "group",
    xaxis = list(title = "Year"),
    yaxis = list(
      title = "ROI (Earnings / Tuition)",
      rangemode = "tozero"
    ),
    yaxis2 = list(
      title = "Dollars (Earnings & Tuition)",
      overlaying = "y",
      side = "right",
      rangemode = "tozero"
    ),
    legend = list(title = list(text = "<b>Series</b>"))
  )

# htmlwidgets::saveWidget(plot, "roi_plot.html")

plot

Time Series Analysis Using Employment Population Ratio

emp_ratio_df$date <- as.Date(emp_ratio_df$date, "%Y-%m-%d")
emp_ratio_df$employment_population_ratio <- as.numeric(emp_ratio_df$employment_population_ratio)
max <- max(emp_ratio_df$employment_population_ratio)

fig <- plot_ly(
  emp_ratio_df,
  x = ~date,
  y = ~employment_population_ratio,
  color = ~education_level,        
  colors = c(
    "ba_plus" = "#E64B35",
    "some_college_assoc" = "#4DBBD5",
    "high_school" = "#00A087"
  ),
  type = 'scatter',
  mode = 'lines'
)

fig <- fig |>
  layout(
    title = "Employment Population Ratio (2019–2025)",
    xaxis = list(title = "Date"),
    yaxis = list(title = "Employment-Population Ratio"),
    legend = list(title = list(text = "<b>Education Level</b>"))
    # shapes = list(
    #   list(type = 'rect',
    #        x0 = "2019-12-01", x1 = "2025-12-01",
    #        y0 = 40, y1 = max,
    #        line = list(color = 'rgba(0, 0, 255, 0.5)', width = 2),
    #        fillcolor = 'rgba(0, 0, 255, 0.2)')
    # ),
    # annotations = list(
    #   list(
    #     x = "2020-06-01",
    #     y = max + 2,
    #     text = "Onset of COVID-19",
    #     showarrow = FALSE,
    #     font = list(color = 'black', size = 12)
    #   )
  # )
)


fig
unemp_df$date <- as.Date(unemp_df$date, "%Y-%m-%d")
unemp_df$unemployment_rate <- as.numeric(unemp_df$unemployment_rate)
max <- max(unemp_df$unemployment_rate)
min <- min(unemp_df$unemployment_rate)


fig <- plot_ly(
  unemp_df,
  x = ~date,
  y = ~unemp_df$unemployment_rate,
  color = ~education_level,        
  colors = c(
    "ba_only" = "#E64B35",
    "some_college_assoc" = "#4DBBD5",
    "high_school" = "#00A087"
  ),
  type = 'scatter',
  mode = 'lines'
)

fig <- fig |>
  layout(
    title = "Unemployment Rate (2019–2025)",
    xaxis = list(title = "Date"),
    yaxis = list(title = "Employment-Population Ratio"),
    legend = list(title = list(text = "<b>Education Level</b>"))
      # ,
    # shapes = list(
    #   list(type = 'rect',
    #        x0 = "2019-12-01", x1 = "2020-12-01",
    #        y0 = min-2, y1 = max,
    #        line = list(color = 'rgba(0, 0, 255, 0.5)', width = 2),
    #        fillcolor = 'rgba(0, 0, 255, 0.2)')
    # )
    # annotations = list(
    #   list(
    #     x = "2020-06-01",
    #     y = max + 2,
    #     text = "COVID-19",
    #     showarrow = FALSE,
    #     font = list(color = 'black', size = 12)
    #   )
  # )
)


fig

Convert to Time-Series Object

library(lubridate)

Attaching package: 'lubridate'
The following objects are masked from 'package:base':

    date, intersect, setdiff, union
vocational_unemp <- unemp_df[unemp_df$education_level == "some_college_assoc",]
###### Convert to Time Series Object ######
ts_unemp <- ts(vocational_unemp$unemployment_rate, start = c(2019, 1), frequency = 12)
library(TSstudio)
library(zoo) 

Attaching package: 'zoo'
The following objects are masked from 'package:base':

    as.Date, as.Date.numeric
ts_lags(ts_unemp)

Overall, the lag plots do not show strong linear relationships across the higher order lags. Lag 1 shows the strongest upward pattern, however, it is not clearly linear. This suggests that this month’s unemployment rate for vocational grads is moderately correlated with last month’s unemployment rate. Lags 2 and 3 also show positive sloping patterns. However, after Lag 4, the plots begin to appear random.

This pattern indicates mild autocorrelation at short lags. After lags 3-4, there is no long-term autocorrelation or seasonality.

library(forecast)
Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 
ggAcf(ts_unemp, frequency=12)
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `frequency`

ggPacf(ts_unemp, frequency=12)
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `frequency`

The ACF plot shows strong auto-correlation at initial lags, indicating that last month’s unemployment rate is very predictive of this month’s unemployment rate.

The ACF plot also shows that vocational graduates’ unemployment rates are non-stationary, with values decaying slowly over time, rather than rapidly. This suggests that observations far in the past influence the current value. The PACF also indicates that current values depend heavily on the previous values, with a very large spike at lag 1.

tseries::adf.test(ts_unemp)

    Augmented Dickey-Fuller Test

data:  ts_unemp
Dickey-Fuller = -2.7152, Lag order = 4, p-value = 0.2829
alternative hypothesis: stationary

The P-value obtained from the ADF test is greater than 0.05. Therefore, we do not have enough evidence to reject the null hypothesis at 5% significance level. This indicates the data is non-stationary, which aligns with the ADF and PACF plots we obtained.

library(dplyr)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats 1.0.0     ✔ stringr 1.5.1
✔ purrr   1.0.2     ✔ tibble  3.2.1
✔ readr   2.1.5     ✔ tidyr   1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ plotly::filter() masks rstatix::filter(), dplyr::filter(), stats::filter()
✖ dplyr::lag()     masks stats::lag()
✖ car::recode()    masks dplyr::recode()
✖ purrr::some()    masks car::some()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
dc = decompose(ts_unemp)

df <- tibble(
  date = as.Date(time(ts_unemp)),
  data = dc$x,
  trend = dc$trend,
  seasonal = dc$seasonal,
  remainder = dc$random
)

df %>%
  pivot_longer(-date) %>%
  ggplot(aes(date, value)) +
  geom_line() +
  facet_wrap(~name, scales = "free_y", ncol = 1) +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  theme_minimal()
Warning: Removed 6 rows containing missing values or values outside the scale range
(`geom_line()`).

Seasonal: The seasonal component represents regular, repeating patterns throughout the year. While the plot appears to show a consistent yearly rise, followed by a dip, this pattern doesn’t seem to represent true seasonality. The lag plots and ACF and PACF plots we generated earlier do not show seasonality. The seasonal pattern shown in the classical decomposition likely is visualizing the assumption of a yearly seasonal structure rather than true seasonality in the unemployment rate.

Trend: The trend line represents the underlying pattern of unemployment over a longer period, excluding short-term fluctuations. The trend shows a rapid increase from 2019 to 2020, followed by the start of a steady decline in late 2020 that has continued through 2025. This very clearly shows the spike in unemployment as a result of COVID-19, and the economy’s slow recovery over time.

Remainder: The remainder component captures the random, unpredictable variations in the data not explained by the trend or seasonality. The remainder clearly shows the steep spike in vocational graduates’ unemployment rate at the start of 2020 due to the COVID-19 pandemic. After 2020, the remainder values decreases and stabilizes, showing that the shocks caused by COVID-19 eventually diminished once the economy began to transition into recovery.

library(patchwork)
diff_1 <- diff(ts_unemp)

diff_2 <- diff(ts_unemp, differences = 2)

acf_plot_1 <- ggAcf(diff_1,50) +
  ggtitle("ACF of First-Order Differenced Series") +
  theme_minimal()

acf_plot_2 <- ggAcf(diff_2,50) +
  ggtitle("ACF of Second-Order Differenced Series") +
  theme_minimal()

acf_plot_1/acf_plot_2